--- title: Analyze keywords: fastai sidebar: home_sidebar summary: "This Notebook analyzes various concepts for anomaly detection" description: "This Notebook analyzes various concepts for anomaly detection" nb_path: "03_analyze.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
from anomaly.utils import *
from anomaly.binet import *

import pandas as pd
{% endraw %} {% raw %}
def _shift_columns (a,ws=3): return np.dstack(list(reversed([np.roll(a,i) for i in range(0,ws)])))[0]

def subsequences_fast(df, event_ids, ws=None, min_ws=64):
    """Build one left-zero-padded prefix subsequence per event.

    For every event in `df` (a DataFrame of integer-encoded columns with
    traces stored contiguously), return the window of up to `ws` events of the
    same trace ending at that event, with positions before the trace start
    zeroed out.

    Parameters
    ----------
    df : DataFrame of integer-encoded event attributes
    event_ids : array of per-trace event positions; 0 marks a trace start
    ws : window width; defaults to the longest trace length minus one
    min_ws : lower bound enforced on `ws`

    Returns
    -------
    (res, idx) : `res` has shape (len(df)-1, n_columns, ws); `idx` holds the
        positions of the events each subsequence is a prefix for.

    Raises
    ------
    ValueError : if an explicit `ws` is smaller than the longest trace allows.
    """
    def _shift(col, width):
        # Row i holds col[i-width+1 .. i]; np.roll wraps at the start, but the
        # wrapped-in values are zeroed by the mask below.
        return np.dstack([np.roll(col, s) for s in range(width - 1, -1, -1)])[0]

    max_trace_len = int(event_ids.max()) + 1
    if not ws:
        ws = max_trace_len - 1
    elif ws < max_trace_len - 1:
        raise ValueError(f"ws must be greater equal {max_trace_len-1}")
    ws = max(min_ws, ws)  # original also set a dead `pad = ws` here — removed

    trace_start = np.where(event_ids == 0)[0]
    trace_len = np.array([trace_start[i] - trace_start[i - 1]
                          for i in range(1, len(trace_start))] + [len(df) - trace_start[-1]])
    tmp = np.stack([_shift(df[c], ws) for c in list(df)])
    idx = [range(trace_start[i], trace_start[i] + trace_len[i]) for i in range(len(trace_start))]
    idx = np.array([y for x in idx for y in x])

    res = np.rollaxis(tmp, 1)[idx]
    # Zero every window position that falls before the current trace's start,
    # removing the values np.roll wrapped in from the previous trace.
    mask = ws - 1 - event_ids[idx][:, None] > np.arange(res.shape[2])
    res[np.broadcast_to(mask[:, None], res.shape)] = 0
    return res[:-1], (idx + 1)[:-1]
{% endraw %} {% raw %}
def _shift_columns (a,ws=3): return np.dstack(list(reversed([np.roll(a,i) for i in range(0,ws)])))[0]
def windows_fast(df, event_ids, ws=5, pad=None):
    """Build one fixed-width context window per non-initial event.

    Every trace is left-padded with `ws-1` zeros; for each event except the
    first of its trace, the window holds the `ws` events immediately before it
    (zeros where the trace is shorter than `ws`).

    Parameters
    ----------
    df : DataFrame of integer-encoded event attributes, traces contiguous
    event_ids : array of per-trace event positions; 0 marks a trace start
    ws : window width
    pad : if given, left-pad each window with zeros up to this total width

    Returns
    -------
    (res, idx) : `res` has shape (n_events - n_traces, n_columns, ws or pad);
        `idx` holds the positions of the predicted (non-initial) events.
    """
    def _shift(col, width):
        # Row i holds col[i-width+1 .. i]; np.roll wraps, but the inserted
        # leading zeros guarantee wrapped values never reach a selected row.
        return np.dstack([np.roll(col, s) for s in range(width - 1, -1, -1)])[0]

    trace_start = np.where(event_ids == 0)[0]
    trace_len = [trace_start[i] - trace_start[i - 1]
                 for i in range(1, len(trace_start))] + [len(df) - trace_start[-1]]
    # After padding, trace i's first event sits at trace_start[i] + (i+1)*(ws-1);
    # select the windows ending at events 0 .. trace_len[i]-2 of each trace.
    idx = [range(trace_start[i] + (i + 1) * (ws - 1),
                 trace_start[i] + trace_len[i] + (i + 1) * (ws - 1) - 1)
           for i in range(len(trace_start))]
    idx = np.array([y for x in idx for y in x])
    insert_at = np.repeat(trace_start, ws - 1)  # ws-1 zeros before every trace
    tmp = np.stack([_shift(np.insert(np.array(df[c]), insert_at, 0, axis=0), ws)
                    for c in list(df)])
    res = np.rollaxis(tmp, 1)[idx]
    if pad:
        res = np.pad(res, ((0, 0), (0, 0), (pad - ws, 0)))
    return res, np.where(event_ids != 0)[0]
{% endraw %} {% raw %}
class TestModel(nn.Module):
    """Hand-rolled recurrent next-activity predictor with a 2-d bottleneck.

    Embeds the activity column, runs a simple recurrent loop over the sequence,
    projects the final hidden state through a 2-unit bottleneck (useful for 2-d
    visualisation of activations) and emits log-probabilities over the
    activity vocabulary.
    """

    def __init__(self, pp_data, is_cuda=False, vocab_col='activity'):
        super().__init__()
        vocab_size = len(pp_data.procs.categorify[vocab_col])
        # Position of `vocab_col` among the categorical columns of the input.
        self.vocab_index = {s: i for i, s in enumerate(pp_data.cat_names[0])}[vocab_col]
        n_fac, n_hidden = round(sqrt(vocab_size)) + 1, round(sqrt(vocab_size) * 2)
        self.n_hidden = n_hidden
        self.is_cuda = is_cuda
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_bottleneck = nn.Linear(n_hidden, 2)
        self.l_out = nn.Linear(2, vocab_size)

    def forward(self, xb):
        # xb: (batch, n_cat_columns, seq_len) — permute to (cols, seq, batch)
        # and pick the activity column; each `c` below is a (batch,) step.
        cs = xb.permute(1, 2, 0)[self.vocab_index]
        bs = len(cs[0])
        h = torch.zeros((bs, self.n_hidden))
        if self.is_cuda: h = h.cuda()
        for c in cs:
            inp = torch.relu(self.l_in(self.e(c)))
            h = torch.tanh(self.l_hidden(h + inp))
        h = self.l_bottleneck(h)
        # BUG FIX: log-softmax must normalize over the vocabulary dimension;
        # the original used dim=0, normalizing across the batch instead.
        return F.log_softmax(self.l_out(h), dim=-1)
{% endraw %} {% raw %}
class Camargo_specialized_bottleneck(torch.nn.Module):
    """Camargo-style activity LSTM with a 2-unit linear bottleneck before the
    output layer (the 2-d bottleneck activations can be plotted directly)."""

    def __init__(self, o):
        super().__init__()
        hidden = 25
        vocab_act = len(o.procs.categorify['activity'])
        emb_dim_act = int(sqrt(vocab_act)) + 1

        self.emb_act = nn.Embedding(vocab_act, emb_dim_act)
        self.lstm_act = nn.LSTM(emb_dim_act, hidden, batch_first=True, num_layers=2)
        self.l_bottleneck = nn.Linear(hidden, 2)
        self.linear_act = nn.Linear(2, vocab_act)

    def forward(self, xcat):
        embedded = self.emb_act(xcat)
        lstm_out, _ = self.lstm_act(embedded)
        last_step = lstm_out[:, -1]          # hidden state at the final time step
        compressed = self.l_bottleneck(last_step)
        return self.linear_act(compressed)
{% endraw %} {% raw %}
class Camargo_specialized(torch.nn.Module):
    """Camargo-style next-activity LSTM predictor (no bottleneck)."""

    def __init__(self, o):
        super().__init__()
        hidden = 25
        vocab_act = len(o.procs.categorify['activity'])
        emb_dim_act = int(sqrt(vocab_act)) + 1

        self.emb_act = nn.Embedding(vocab_act, emb_dim_act)
        self.lstm_act = nn.LSTM(emb_dim_act, hidden, batch_first=True, num_layers=2)
        self.linear_act = nn.Linear(hidden, vocab_act)

    def forward(self, xcat):
        embedded = self.emb_act(xcat)
        lstm_out, _ = self.lstm_act(embedded)
        last_step = lstm_out[:, -1]
        # Raw logits on purpose: a softmax here proved very bad for anomaly
        # detection and, in general, for tasks where multiple outcomes are
        # reasonable.
        return self.linear_act(last_step)
{% endraw %} {% raw %}
# Load the PDC 2020 dataset: training event log, test log, and ground-truth labels.
event_df, test_df, df_truth = load_data(data='PDC2020')
{% endraw %} {% raw %}
# One row per trace id; presumably a container for per-trace results — TODO confirm usage.
trace_df = pd.DataFrame(index= event_df['trace_id'].unique())
{% endraw %} {% raw %}
# Alias: work on the training event log.
log = event_df
{% endraw %} {% raw %}
# Preprocess the log (categorify the activity column), build sliding-window
# dataloaders via `windows_fast`, and train the bottleneck model for 10 epochs.
cols,outcome='activity',False
o=PPObj(log,procs=Categorify(),cat_names=cols,y_names=cols,splits=split_traces(log))
dls=o.get_dls(outcome=outcome,windows=partial(windows_fast))
m=Camargo_specialized_bottleneck(o)
train_validate(dls,m,epoch=10)
epoch train_loss valid_loss accuracy time
0 2.169466 1.859249 0.386568 00:03
1 1.768058 1.670261 0.418509 00:03
2 1.714815 1.635697 0.461916 00:03
3 1.654079 1.578007 0.470106 00:03
4 1.600725 1.547093 0.469287 00:03
5 1.621557 1.509645 0.499590 00:03
6 1.545782 1.473332 0.520066 00:03
7 1.535814 1.469299 0.515152 00:03
8 1.494484 1.455103 0.525798 00:03
9 1.461321 1.454333 0.525798 00:03
Better model found at epoch 0 with valid_loss value: 1.859249472618103.
Better model found at epoch 1 with valid_loss value: 1.6702611446380615.
Better model found at epoch 2 with valid_loss value: 1.6356970071792603.
Better model found at epoch 3 with valid_loss value: 1.5780072212219238.
Better model found at epoch 4 with valid_loss value: 1.5470929145812988.
Better model found at epoch 5 with valid_loss value: 1.5096454620361328.
Better model found at epoch 6 with valid_loss value: 1.4733315706253052.
Better model found at epoch 7 with valid_loss value: 1.4692987203598022.
Better model found at epoch 8 with valid_loss value: 1.4551030397415161.
Better model found at epoch 9 with valid_loss value: 1.4543331861495972.
Better model found at epoch 0 with valid_loss value: 0.48875856399536133.
0.48875856399536133
{% endraw %} {% raw %}
# Re-preprocess without train/valid splits to inspect the encoded items table.
o=PPObj(log,procs=Categorify(),cat_names=cols,y_names=cols)
o.items
event_id activity trace_id
trace_id
1 0 2 1
1 1 3 1
1 2 4 1
1 3 7 1
1 4 5 1
... ... ... ...
1000 12 19 1000
1000 13 16 1000
1000 14 18 1000
1000 15 20 1000
1000 16 1 1000

16450 rows × 3 columns

{% endraw %} {% raw %}
# Build windows over the full encoded log, collect the encoded next activity
# for each window, and score every window with the trained model.
wds,idx=windows_fast(o.xs, o.event_ids)
y = o.items['activity'].iloc[idx].values


res=(m(LongTensor(wds.squeeze()).cuda()))
{% endraw %} {% raw %}
wds
array([[[ 0,  0,  0,  0,  2]],

       [[ 0,  0,  0,  2,  3]],

       [[ 0,  0,  2,  3,  4]],

       ...,

       [[14, 10, 17, 19, 16]],

       [[10, 17, 19, 16, 18]],

       [[17, 19, 16, 18, 20]]], dtype=int8)
{% endraw %} {% raw %}
len(res), len(wds)
(15450, 15450)
{% endraw %} {% raw %}
# Drop each trace's first event (event_id 0) and index by trace id so the
# remaining rows align with the per-window predictions above.
event_df.reset_index(drop=True, inplace=True)
event_df.drop(event_df[event_df.event_id == 0].index, inplace=True)
event_df.index = event_df['trace_id']
event_df
event_id activity trace_id
trace_id
1 1 t11 1
1 2 t21 1
1 3 t35 1
1 4 t26 1
1 5 t41 1
... ... ... ...
1000 12 t82 1000
1000 13 t71 1000
1000 14 t81 1000
1000 15 t91 1000
1000 16 end 1000

15450 rows × 3 columns

{% endraw %} {% raw %}
res.shape
torch.Size([15450, 21])
{% endraw %} {% raw %}
y.shape
(15450,)
{% endraw %} {% raw %}
len(o.items.index)
16450
{% endraw %} {% raw %}
# Evaluate: flag anomalous traces from the model outputs (gmean threshold)
# and compare against the traces labelled not-normal in the ground truth.
a = AnomalyDetection(res, y, event_df, df_truth.loc[df_truth['normal']== False],binet=True)
a(threshold='gmean', analyze=True,s=20)
Number of Traces Number of Anomalies Classified as Anomalies Correct Classified F1 Score best Threshold Mean max Fscore
0 1000 456 412 316 0.728111 0.9 0.51555 0.728111
{% endraw %} {% raw %}
# Shared store for layer outputs captured by forward hooks.
activation = {}

def get_activation(name):
    """Return a forward hook that stores the layer's detached output under `name`."""
    def _store(module, inputs, output):
        activation[name] = output.detach()
    return _store
{% endraw %} {% raw %}
# Capture the 2-d bottleneck activations while re-scoring all windows.
m.l_bottleneck.register_forward_hook(get_activation('self.l_bottleneck'))
output = (m(LongTensor(wds.squeeze()).cuda()))
{% endraw %} {% raw %}
import plotly.graph_objects as go

# Scatter plot of the two bottleneck dimensions for every window.
x1 = activation['self.l_bottleneck'][:,0].cpu()
y1 = activation['self.l_bottleneck'][:,1].cpu()

fig = go.Figure(data=go.Scattergl(
    x = x1,
    y = y1,
    mode='markers',

))

fig.show()
{% endraw %} {% raw %}
from sklearn.cluster import DBSCAN
# Cluster the bottleneck activations; DBSCAN labels outlier points as -1.
clustering = DBSCAN(eps=0.3, min_samples=10).fit(activation['self.l_bottleneck'].cpu())

# Build a colormap with one color per cluster; force the first entry (the
# noise/outlier cluster) to black.
unique_labels= set(clustering.labels_)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
color_indices = clustering.labels_
colormap = matplotlib.colors.ListedColormap(colors)
colormap.colors[0] = (0.,0.,0.,1.)
{% endraw %} {% raw %}
import plotly.express as px
# Same bottleneck scatter, colored by DBSCAN cluster label.
df_plot = pd.DataFrame({'x': x1, 'y': y1, 'c': color_indices})
df_plot["c"] = df_plot["c"].astype(str)
fig = px.scatter(df_plot, x="x", y="y", color="c")
fig.show()
{% endraw %} {% raw %}
# Positions DBSCAN marked as noise (label -1) are treated as anomalous events.
anomalies = np.where(clustering.labels_ == -1)[0].tolist()
len(anomalies)
251
{% endraw %} {% raw %}
# Map anomalous event positions back to their (unique) trace ids.
anomalies = event_df.iloc[anomalies]['trace_id'].unique()
len(anomalies)
196
{% endraw %} {% raw %}
# Case ids that the ground truth labels as not normal (i.e. anomalous).
truth = df_truth.loc[df_truth['normal']==False]['case'].unique()
{% endraw %} {% raw %}
len(set(truth).intersection(set(anomalies)))
165
{% endraw %} {% raw %}
# F1 of the DBSCAN-based detection against the ground-truth anomalous traces.
f1score(truth, anomalies)
(0.5061349693251533, 165, 456, 196)
{% endraw %}